# Global chunk options: show the source of every chunk (echo = TRUE) and
# print chunk output without a comment prefix (comment = NA).
knitr::opts_chunk$set(
  echo = TRUE,
  comment = NA
)
library(DT)
library(xtable)
library(dplyr)
library(ggplot2)
library(plotly)
# Read the raw activity file, keeping strings as character, then parse the
# `date` column into a proper Date vector.
mydata <- read.csv("activity.csv", stringsAsFactors = FALSE)
mydata[["date"]] <- as.Date(mydata[["date"]])

Introduction

This assignment uses data collected by a personal activity monitoring device at five-minute intervals, every day over a two-month period.


Part 1

The mean, sum, and median number of steps taken per day

Here, observations with missing values in the steps column were excluded. The remaining data was grouped by date, and the daily mean, total, and median number of steps were computed for each day. These values are shown below as columns of the summarized data. The total number of observations, before excluding missing values, is 17568.


# Drop every row that contains a missing value before summarising.
mydata <- na.omit(mydata)

# One row per date: mean of `steps` (rounded to 2 decimals), total and median.
DataSummary <- summarize(
  group_by(mydata, date),
  MeanSteps = round(mean(steps), 2),
  SumSteps = sum(steps),
  MedianSteps = median(steps)
)

# Show the first ten days as a captioned table.
knitr::kable(
  head(DataSummary, 10),
  caption = "The mean, total number of steps and median steps per day"
)
The mean, total number of steps and median steps per day
date MeanSteps SumSteps MedianSteps
2012-10-02 0.44 126 0
2012-10-03 39.42 11352 0
2012-10-04 42.07 12116 0
2012-10-05 46.16 13294 0
2012-10-06 53.54 15420 0
2012-10-07 38.25 11015 0
2012-10-09 44.48 12811 0
2012-10-10 34.38 9900 0
2012-10-11 35.78 10304 0
2012-10-12 60.35 17382 0

Histogram of total number of steps taken each day

# Histogram of the total number of steps per day, with bars shaded by count.
# `bins = 30` spells out the geom_histogram() default, so the plot is the same
# as before but the "pick better value with binwidth" message is not emitted.
gplot1 <- ggplot(data = DataSummary, aes(SumSteps)) +
  geom_histogram(
    aes(fill = ..count..),
    bins = 30,
    alpha = .5,
    col = "blue"
  ) +
  labs(
    title = "Histogram of Total Number of Steps Per Day",
    x = "Total number of steps taken each day"
  )

# Let the interactive widget auto-print. return() is only valid inside a
# function and must not be used at the top level of a script or chunk.
ggplotly(gplot1)

Average Daily Activity Pattern

Below is a time series plot of daily activity, with a vertical line drawn at the five-minute interval that has the maximum number of steps.

# Identifier of the five-minute interval in which the single largest step
# count was recorded. The marker below is drawn on the x axis, which is
# `interval`, so it has to be an interval value and not the step count itself
# (max(steps) would place the line at an unrelated x position).
maxint <- mydata$interval[which.max(mydata$steps)]

# Steps against interval for every observation, with a dotted red vertical
# line at the interval holding the maximum.
gplot2 <- ggplot(data = mydata, aes(x = interval, y = steps)) +
  geom_point() +
  geom_vline(
    xintercept = maxint,
    linetype = "dotted",
    color = "red",
    size = 1.5
  ) +
  labs(
    title = "Time series plot of steps vs intervals",
    x = "Intervals"
  )

# Auto-print the widget; return() is only valid inside a function.
ggplotly(gplot2)

The maximum number of steps recorded in a single five-minute interval across the data set is 806


Part 2

Activity data with missing values imputed

# Load the raw file a second time, so the rows holding missing values are
# present again, parse the dates, and count the missing cells in the table.
mydataM <- read.csv("activity.csv", stringsAsFactors = FALSE)
mydataM[["date"]] <- as.Date(mydataM[["date"]])

missingData <- sum(is.na(mydataM))

The number of observations with missing values is 2304

The missing values occur only in the steps column. We will replace each missing value with the mean of that column and then verify that the new data set has zero missing values.

# Mean imputation: every missing `steps` entry receives the mean of the
# observed (non-missing) step counts.
mydataM$steps <- replace(
  mydataM$steps,
  is.na(mydataM$steps),
  mean(mydataM$steps, na.rm = TRUE)
)

# Re-count the missing cells to confirm the imputation left none behind.
missingDataM <- sum(is.na(mydataM))

After imputing missing values, the number of observations with missing values is 0

After replacing the missing values, the data set contains 17568 observations in total


The mean, sum, and median number of steps each day with missing values imputed

    # One row per date from the imputed data: mean, total and median of
    # `steps`, each rounded to two decimals.
    DataSum <- summarize(
      group_by(mydataM, date),
      M_Steps = round(mean(steps), 2),
      S_Steps = round(sum(steps), 2),
      MedSteps = round(median(steps), 2)
    )

# Show the first ten days as a captioned table.
knitr::kable(
  head(DataSum, 10),
  caption = "The Mean, Total and Median steps per day"
)
The Mean, Total and Median steps per day
date M_Steps S_Steps MedSteps
2012-10-01 37.38 10766.19 37.38
2012-10-02 0.44 126.00 0.00
2012-10-03 39.42 11352.00 0.00
2012-10-04 42.07 12116.00 0.00
2012-10-05 46.16 13294.00 0.00
2012-10-06 53.54 15420.00 0.00
2012-10-07 38.25 11015.00 0.00
2012-10-08 37.38 10766.19 37.38
2012-10-09 44.48 12811.00 0.00
2012-10-10 34.38 9900.00 0.00
# Histogram of the total number of steps per day, computed from the data set
# in which missing values were imputed. The x aesthetic is the daily total
# (S_Steps) so that the plot matches its axis label and title; the daily mean
# (M_Steps) is a different quantity.
# `bins = 30` spells out the geom_histogram() default to avoid the
# "pick better value with binwidth" message without changing the binning.
gplot3 <- ggplot(data = DataSum, aes(S_Steps)) +
  geom_histogram(
    aes(fill = ..count..),
    bins = 30,
    alpha = .5,
    col = "blue"
  ) +
  labs(
    title = "Histogram of the total steps with missing values imputed",
    x = "Total number of steps taken each day"
  )

# Auto-print the widget; return() is only valid inside a function.
ggplotly(gplot3)

The results show that replacing all 2304 missing values with the mean has the effect of increasing the overall values (overall mean, total number of steps, and median) compared to the previous values computed with the missing observations removed.


Part 3

Possible differences in activity pattern during weekdays and weekends

   # Make sure `date` is a Date before deriving the day of the week.
   mydataM$date <- as.Date(mydataM$date)

# Create a factor variable 'WDays' of weekdays and weekends.
# POSIXlt's `wday` component numbers the days 0-6 starting on Sunday, so the
# values 1:5 are Monday to Friday. Unlike matching the output of weekdays()
# against English day names, this does not depend on the session locale.
mydataM$WDays <- factor(
  as.POSIXlt(mydataM$date)$wday %in% 1:5,
  levels = c(FALSE, TRUE),
  labels = c("Weekend", "Weekday")
)

# Number of observations per day type.
table(mydataM$WDays)

Weekend Weekday 
   4608   12960 

A time series of data for weekdays and weekends

    # Steps against interval for every observation, one panel per day type.
    ggplot(mydataM, aes(x = interval, y = steps)) +
      geom_point() +
      facet_wrap(~WDays)


Filter data for weekdays and weekends

# Keep only the weekend observations.
WeekEnd <- filter(mydataM, WDays == "Weekend")

# Line plot of steps against interval for the weekend subset.
ggplot(WeekEnd, aes(x = interval, y = steps)) +
  geom_line() +
  labs(title = "WeekEnd")

# Keep only the weekday observations. A plain row filter needs no grouping:
# grouping by date, interval and WDays selected the same rows but left the
# result grouped into one-row groups, unlike the ungrouped WeekEnd subset.
WeekDay <- mydataM %>%
  filter(WDays == "Weekday")

# Line plot of steps against interval for the weekday subset.
ggplot(data = WeekDay) +
  geom_line(mapping = aes(x = interval, y = steps)) +
  labs(title = "WeekDay")